Load Dataset and Pre-processing (Yongye)
# Load the MA public schools dataset and keep only large high schools
# that offer AP exams; the cleaned result is written back out for the
# later analysis sections.
education <- read.csv("MA_Public_Schools_2017.csv")
filtered_education <-
education %>%
# filter out by number of enrollment (at least 800 students)
filter(TOTAL_Enrollment >= 800) %>%
# filter out only high school (grades 9-12)
filter(Grade == "09,10,11,12") %>%
# filter out school that has AP exams (non-empty AP taker count)
filter(AP_Test.Takers != "") %>%
# select columns that do not have na values in all row
select_if(~ !all(is.na(.))) %>%
# Remove columns that do not have any meaning to our investigation
#select(-District.Code, -PK_Enrollment, -K_Enrollment, -X1_Enrollment, -X2_Enrollment, -X3_Enrollment, -X4_Enrollment, -X5_Enrollment, -X6_Enrollment, -X7_Enrollment, -X8_Enrollment, -Fax, -Phone, -Function, -Address.2) %>%
# sort the name of town
arrange(Town)
# row.names = FALSE prevents a spurious "X" index column from appearing
# when this CSV is re-read later in the analysis
write.csv(filtered_education, "filter_education.csv", row.names = FALSE)
COL_NAMES <- colnames(filtered_education) # get all column names
TOWN_NAMES <- filtered_education$Town # get all city names
Web Scraping (Yongye)
# Scrape the town demographics table out of a locally saved Wikipedia page.
# read and store the html file
town_info <- readLines(file.path(getwd(), "wikipedia.html"))
# collapse into a single string so the extraction regex can span lines
town_info_str <- paste(town_info, collapse = "")
# pattern capturing the body of the first sortable wikitable on the page
wiki_table_pattern <- "<table class=\"wikitable sortable\">(.*)</table>"
# Extract the table html using the pattern (capture group 1)
table_html <- str_match(town_info_str, wiki_table_pattern)[, 2]
# re-parse the HTML fragment; avoid shadowing base::table / html with
# the result names
table_dom <- read_html(table_html)
# convert the HTML table(s) to a list of data frames and keep the first
wiki_tables <- table_dom %>% html_table()
table_data <- wiki_tables[[1]]
# get all name of columns to iterate over the column
TABLE_COL_NAMES <- colnames(table_data)
# strip the formatting characters "$", "," and "+" from every column in
# one pass so the values can later be parsed as numbers; inside the
# character class all three are matched literally
for (col in TABLE_COL_NAMES) {
  table_data[[col]] <- gsub("[$,+]", "", table_data[[col]])
}
More Web Scraping (Yongye)
# Attach the scraped town demographics (median family income, households,
# population) to each school row of filtered_education.
#
# match() finds, for each school's town, the index of the FIRST matching
# row in table_data. This is vectorized, replacing the original loop that
# grew vectors with c() one element at a time, and it also guarantees
# exactly one value per school even if a town appears more than once in
# the scraped table (the loop version would have appended several values
# and misaligned all three vectors).
town_idx <- match(filtered_education$Town, table_data$Municipality)
median_family_income <- as.double(table_data$`Median family income`[town_idx])
households <- as.double(table_data$Households[town_idx])
population <- as.double(table_data$Population[town_idx])
# number of schools whose town was found in the scraped table
count <- sum(!is.na(town_idx))
# towns we could not find (one entry per unmatched school row)
missing_town <- filtered_education$Town[is.na(town_idx)]
cat("Here are a list of towns that we could not find the median family income, population, and households:\n", missing_town, "\n")
## Here are a list of towns that we could not find the median family income, population, and households:
## Brighton Charlestown East Boston Hathorne Newton Centre Newtonville North Chelmsford North Eastham North Easton South Easton
# Final: add scraped data to our filtered education dataset
filtered_education <-
filtered_education %>%
mutate(
  median_family_income = median_family_income,
  households = households,
  population = population
)
Regression (Yongye)
# Simple OLS fit of 10th-grade English CPI on median family income.
# The slope (3.448e-05 CPI points per dollar of income) is tiny only
# because income is measured in raw dollars.
lm(formula = MCAS_10thGrade_English_CPI ~ median_family_income, data = filtered_education)
##
## Call:
## lm(formula = MCAS_10thGrade_English_CPI ~ median_family_income,
## data = filtered_education)
##
## Coefficients:
## (Intercept) median_family_income
## 9.295e+01 3.448e-05
# Pearson correlation test: income vs 10th-grade English MCAS CPI
coefficient1 <- cor.test(
filtered_education$median_family_income,
filtered_education$MCAS_10thGrade_English_CPI)
print(coefficient1)
##
## Pearson's product-moment correlation
##
## data: filtered_education$median_family_income and filtered_education$MCAS_10thGrade_English_CPI
## t = 7.0585, df = 116, p-value = 1.321e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4077851 0.6631900
## sample estimates:
## cor
## 0.5481405
# p-value is 1.321e-10
# Pearson correlation test: income vs 10th-grade Math MCAS CPI
coefficient2 <- cor.test(
filtered_education$median_family_income,
filtered_education$MCAS_10thGrade_Math_CPI)
print(coefficient2)
##
## Pearson's product-moment correlation
##
## data: filtered_education$median_family_income and filtered_education$MCAS_10thGrade_Math_CPI
## t = 8.6142, df = 116, p-value = 4.087e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5003328 0.7236593
## sample estimates:
## cor
## 0.6246032
# p-value is 4.087e-14
# Pearson correlation test: income vs average SAT math score
coefficient3 <- cor.test(
filtered_education$median_family_income,
filtered_education$Average.SAT_Math)
print(coefficient3)
##
## Pearson's product-moment correlation
##
## data: filtered_education$median_family_income and filtered_education$Average.SAT_Math
## t = 10.817, df = 116, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6054170 0.7884001
## sample estimates:
## cor
## 0.7086278
# p-value < 2.2e-16
# Pearson correlation test: income vs the `ratio` column — presumably
# AP exams passed / taken, matching the histogram label in the
# Bootstrap section; verify against where `ratio` is created
coefficient4 <- cor.test(
filtered_education$median_family_income,
filtered_education$ratio)
print(coefficient4)
##
## Pearson's product-moment correlation
##
## data: filtered_education$median_family_income and filtered_education$ratio
## t = 8.7395, df = 115, p-value = 2.218e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5085269 0.7296020
## sample estimates:
## cor
## 0.6317411
Bootstrap (Elizabeth)
# Bootstrap the four income-vs-outcome correlations by resampling school rows.
M <- 500 # number of bootstrap samples
# BUG FIX: the original hard-coded N_bts <- 128, but filtered_education
# has fewer rows (df = 116 in the cor.tests above implies n = 118), so
# sample(N_bts, ...) could draw out-of-range indices and inject NA rows
# into every bootstrap sample. Derive the size from the data instead.
N_bts <- nrow(filtered_education)
# one row per bootstrap sample: (estimate, p-value) for each of the 4 tests
regressions <- array(dim = c(M, 8))
for (m in seq_len(M)) {
  # resample row indices with replacement
  idx <- sample(N_bts, replace = TRUE)
  coeffs1 <- cor.test(filtered_education$median_family_income[idx],
                      filtered_education$MCAS_10thGrade_English_CPI[idx])
  coeffs2 <- cor.test(filtered_education$median_family_income[idx],
                      filtered_education$MCAS_10thGrade_Math_CPI[idx])
  coeffs3 <- cor.test(filtered_education$median_family_income[idx],
                      filtered_education$Average.SAT_Math[idx])
  coeffs4 <- cor.test(filtered_education$median_family_income[idx],
                      filtered_education$ratio[idx])
  # store correlation estimate and p-value for each test
  regressions[m, ] <- c(coeffs1$estimate, coeffs1$p.value,
                        coeffs2$estimate, coeffs2$p.value,
                        coeffs3$estimate, coeffs3$p.value,
                        coeffs4$estimate, coeffs4$p.value)
}
# mean(regressions[,1])
# sd(regressions[,1])
# hist(regressions[,1])
# average bootstrap estimate / p-value for each statistic; values will
# differ slightly from the original transcript now that no NA rows are drawn
colMeans(regressions)
# bootstrap distribution of each correlation estimate
hist(regressions[,1], breaks = 50, main = "Bootstrapped English MCAS CPI", xlab = "MCAS 10th Grade English CPI" )

hist(regressions[,3],breaks = 50, main = "Bootstrapped Math MCAS CPI", xlab = "MCAS 10th Grade Math CPI" )

hist(regressions[,5], breaks = 50, main = "Bootstrapped Math SAT", xlab = "Average Math SAT Score" )

hist(regressions[,7], breaks = 50, main = "Bootstrapped AP Ratio", xlab = "Ratio of AP Exams passed to taken" )

Monte Carlo (Elizabeth)
# Monte Carlo setup: estimate sample moments, then draw synthetic data
# with a known correlation structure to study the correlation test.
#correlation test (same call as the Regression section):
# coefficient1 <- cor.test(
# filtered_education$median_family_income,
# filtered_education$MCAS_10thGrade_English_CPI)
# print(coefficient1)
#default conf level = 0.95
#SAT looks normal enough; MCAS is a percentage, probably model with beta
# sample moments used to parameterize the synthetic draws below
mean(filtered_education$median_family_income, na.rm = TRUE)
## [1] 125715.1
sd(filtered_education$median_family_income, na.rm = TRUE)
## [1] 49917.66
#doesn't look normal, but probably truncated normal bc we're excluding the poorest schools
mean(filtered_education$Average.SAT_Reading)
## [1] 505.1875
sd(filtered_education$Average.SAT_Reading)
## [1] 57.98381
mean(filtered_education$Average.SAT_Math)
## [1] 519.3125
sd(filtered_education$Average.SAT_Math)
## [1] 59.18804
# generate synthetic uncorrelated data matching those moments:
N <- 125
fake_income <- rnorm(N, mean = 125700, sd = 50000)
fake_SAT_Reading <- rnorm(N, mean = 505, sd = 58)
cor.test(fake_income, fake_SAT_Reading)
##
## Pearson's product-moment correlation
##
## data: fake_income and fake_SAT_Reading
## t = -1.464, df = 123, p-value = 0.1458
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.29958915 0.04579338
## sample estimates:
## cor
## -0.1308665
# generate synthetic correlated data:
# NOTE(review): income here is in $1000s (mu = 125.7, sd = 50) while
# fake_income above is in raw dollars; correlation is scale-invariant,
# so the tests below are unaffected — but confirm this was intentional.
rho <- 0.6
fake_data <- mvrnorm(N, mu = c(125.7, 505),
                     Sigma = matrix(c(50^2, rho * 50 * 58,
                                      rho * 50 * 58, 58^2), nrow = 2))
plot(fake_data[, 1], fake_data[, 2])

#names(fake_data) <- c("Income", "SAT_Reading")
# Draw N observations from a bivariate normal with correlation rho.
# Column 1 is income (in $1000s by default), column 2 is SAT reading.
# mu and sds are now parameters (defaults reproduce the original
# hard-coded values) so the simulation can be reused for other pairs.
fake_data_fn <- function(N = 125, rho = 0, mu = c(125.7, 505), sds = c(50, 58)) {
  # build the covariance matrix from the sds and the desired correlation
  covar <- rho * sds[1] * sds[2]
  sigma <- matrix(c(sds[1]^2, covar, covar, sds[2]^2), nrow = 2)
  mvrnorm(N, mu = mu, Sigma = sigma)
}
# preview one synthetic (uncorrelated) dataset: column 1 = income in
# $1000s, column 2 = SAT reading score
fake_data_fn()
## [,1] [,2]
## [1,] 201.53556 504.0961
## [2,] 91.23791 519.4790
## [3,] 73.95449 514.2527
## [4,] 101.67969 571.0033
## [5,] 46.42053 596.1686
## [6,] 78.95813 424.5940
## [7,] 155.09828 608.1990
## [8,] 115.27316 531.4654
## [9,] 98.83906 355.2304
## [10,] 160.06125 524.4903
## [11,] 133.70320 501.8270
## [12,] 228.35652 496.8228
## [13,] 101.75752 535.4802
## [14,] 137.71455 421.5071
## [15,] 176.75170 505.6944
## [16,] 109.77675 485.8050
## [17,] 20.02915 527.1698
## [18,] 68.02596 459.6499
## [19,] 116.88875 481.1319
## [20,] 132.79750 466.0799
## [21,] 207.31798 551.2596
## [22,] 147.26535 430.0444
## [23,] 61.40281 422.2780
## [24,] 217.29016 563.6706
## [25,] 153.82078 590.9327
## [26,] 24.41159 499.6297
## [27,] 151.30538 518.8112
## [28,] 199.99672 645.8461
## [29,] 85.13185 436.6094
## [30,] 153.25784 576.0191
## [31,] 76.90077 583.0119
## [32,] 228.33153 483.0252
## [33,] 82.55420 507.9155
## [34,] 217.02929 494.9489
## [35,] 108.43334 532.5629
## [36,] 87.38282 608.2535
## [37,] 81.68364 454.4501
## [38,] 152.32573 588.8864
## [39,] 51.30555 448.4601
## [40,] 165.86390 535.7720
## [41,] 93.10929 486.9911
## [42,] 117.98721 483.4511
## [43,] 120.44443 401.8527
## [44,] 80.80998 513.6890
## [45,] 141.73842 554.0036
## [46,] 114.88027 485.9295
## [47,] 142.00610 429.3549
## [48,] 236.25148 463.2372
## [49,] 206.81265 577.5181
## [50,] 98.82312 570.7474
## [51,] 154.16731 546.6615
## [52,] 55.71727 450.6202
## [53,] 133.43579 415.2781
## [54,] 99.33509 413.2035
## [55,] 222.85407 347.4185
## [56,] 113.73420 546.2224
## [57,] 132.33901 589.6817
## [58,] 89.93902 480.8032
## [59,] 131.20199 447.2035
## [60,] 130.97002 546.6444
## [61,] 253.25685 477.2958
## [62,] 119.76279 465.7836
## [63,] 101.63740 455.9254
## [64,] 160.73214 527.0390
## [65,] 150.45324 578.5156
## [66,] 157.29030 501.8874
## [67,] 65.77391 464.2555
## [68,] 153.27628 481.7904
## [69,] 140.68252 438.8938
## [70,] 217.81903 624.3931
## [71,] 23.47501 494.3379
## [72,] 91.97790 491.5116
## [73,] 190.40293 545.5302
## [74,] 117.28794 490.7586
## [75,] 160.58182 489.5027
## [76,] 141.47024 465.2076
## [77,] 194.08552 487.2647
## [78,] 160.72183 437.3347
## [79,] 128.57120 488.9586
## [80,] 136.80122 505.6750
## [81,] 41.77936 603.6182
## [82,] 219.96841 384.1799
## [83,] 168.03888 541.3280
## [84,] 79.90420 476.3388
## [85,] 33.90753 411.8841
## [86,] 229.55634 497.9937
## [87,] 82.74697 525.3981
## [88,] 135.00875 470.8404
## [89,] 95.40809 523.5146
## [90,] 89.87831 426.8903
## [91,] 71.01324 444.6455
## [92,] 174.80669 489.8343
## [93,] 109.03576 592.3032
## [94,] 167.34175 525.7670
## [95,] 154.09206 547.1457
## [96,] 256.84290 500.2228
## [97,] 84.64323 545.5630
## [98,] 257.07186 550.8986
## [99,] 28.71331 508.8649
## [100,] 142.08475 459.3456
## [101,] 70.24885 513.5279
## [102,] 51.37895 560.1126
## [103,] 117.12406 553.0747
## [104,] 216.67581 568.8323
## [105,] 101.42417 480.3735
## [106,] 132.65701 485.1333
## [107,] 112.44454 527.7250
## [108,] 51.99516 349.9349
## [109,] 193.86586 488.2034
## [110,] 77.98248 512.0402
## [111,] 143.16796 438.9973
## [112,] 125.49265 592.8069
## [113,] 133.86418 452.1692
## [114,] 146.14139 470.4050
## [115,] 87.92805 420.2985
## [116,] 58.92689 443.3537
## [117,] 114.31148 548.3855
## [118,] 106.57653 488.5927
## [119,] 38.65270 461.9122
## [120,] 152.65574 451.6974
## [121,] 172.27661 500.9676
## [122,] 124.75262 495.7891
## [123,] 20.21261 432.3381
## [124,] 123.11371 548.1120
## [125,] 86.36111 528.1756
# Run a Pearson correlation test on the first two columns of df and
# report whether the null hypothesis (zero correlation) is rejected
# at significance level alpha.
cor_test_fn <- function(df, alpha = 0.05) {
  test_result <- cor.test(df[, 1], df[, 2], conf.level = 1 - alpha)
  test_result$p.value < alpha
}
# sanity check: uncorrelated synthetic data should usually NOT reject
cor_test_fn(fake_data_fn())
## [1] FALSE
# Estimate the power of the correlation test: the fraction of S
# simulated datasets (size N, true correlation rho) for which the
# test rejects at level alpha.
power_test_fn <- function(S = 500, N = 125, rho = 0, alpha = 0.05) {
  rejections <- replicate(S, cor_test_fn(fake_data_fn(N, rho), alpha))
  mean(rejections)
}
# with rho = 0 the rejection rate estimates the test's size (~ alpha = 0.05)
power_test_fn()
## [1] 0.046
#Power Study: estimated power across true correlations from -1 to 1
rho_vec <- seq(-1, 1, 0.1)
# vapply returns a typed numeric vector; the original lapply returned a
# list, which plot() only handles via implicit coercion. Each rho value
# is passed positionally to power_test_fn's `rho` argument because
# S, N, and alpha are matched by name.
power_vec <- vapply(rho_vec, power_test_fn, numeric(1),
                    S = 500, N = 125, alpha = 0.05)
plot(x = rho_vec, y = power_vec)

Scratchwork (Elizabeth)
# scratch: SAT math vs % scoring Proficient/Advanced on MCAS math,
# point size = enrollment, color = share economically disadvantaged
ggplot(data=filtered_education) +
geom_point(mapping = aes(x=Average.SAT_Math, y=X..MCAS_10thGrade_Math_P.A, size=TOTAL_Enrollment, color=X..Economically.Disadvantaged))

# same view for SAT writing vs MCAS English
ggplot(data=filtered_education) +
geom_point(mapping = aes(x=Average.SAT_Writing, y=X..MCAS_10thGrade_English_P.A, size=TOTAL_Enrollment, color=X..Economically.Disadvantaged))

# distribution checks for candidate modeling variables
hist(filtered_education$X..Economically.Disadvantaged, breaks = 30)

hist(filtered_education$Average.SAT_Reading)

hist(filtered_education$Average.SAT_Math)

hist(filtered_education$X..MCAS_10thGrade_Math_P.A, breaks = 30)

hist(filtered_education$X..MCAS_10thGrade_English_P.A, breaks = 30)

hist(filtered_education$median_family_income, breaks = 30)

hist(filtered_education$MCAS_10thGrade_English_CPI, breaks = 30)

hist(filtered_education$MCAS_10thGrade_Math_CPI, breaks = 30)

# Scatter of log median family income vs 10th-grade MCAS Math CPI with
# a loess smoother; school names are drawn as text labels.
g <- ggplot(data = filtered_education,
mapping = aes(
x = log(median_family_income),
y = MCAS_10thGrade_Math_CPI,
# BUG FIX: the original sprintf("School = ", School.Name) had no %s,
# so the school name was dropped and every build emitted
# "argument not used by format" warnings (visible in the transcript)
text = sprintf("School = %s", School.Name)
)) +
geom_point(na.rm = TRUE) +
geom_smooth(method = "loess", color = "red") +
geom_text(label=filtered_education$School.Name) +
labs(
x = "Median family income in log-dollar",
y = "10th grade MCAS Math CPI Score",
title = "log median family income vs MCAS Math CPI Score"
) +
theme(plot.title = element_text(hjust = 0.5))
g
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 10 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 10 rows containing missing values (`geom_text()`).

# scratch: SAT math vs MCAS math CPI, point size = enrollment,
# color = the scraped median family income
ggplot(data=filtered_education) +
geom_point(mapping = aes(x=Average.SAT_Math, y=MCAS_10thGrade_Math_CPI, size=TOTAL_Enrollment, color=median_family_income))

Graduation Rate Analysis (Maekala)
# read in tidied data
education <- read.csv("filter_education.csv")
# graduation-rate quartile cut points, computed once instead of inside
# every subset() call (the original also mixed education$X..Graduated
# with the bare column name inside the same subset condition)
grad_q25 <- quantile(education$X..Graduated, probs = 0.25)
grad_q75 <- quantile(education$X..Graduated, probs = 0.75)
# assigning schools into groups by graduation rate (assuming the goal of
# each school is a 100% graduation rate)
bottom_25_percent <- subset(education, X..Graduated <= grad_q25)
middle_50_percent <- subset(education,
                            X..Graduated > grad_q25 & X..Graduated <= grad_q75)
top_25_percent <- subset(education, X..Graduated > grad_q75)
# explore the top correlated columns within each graduation-rate subgroup
# Helper: keep numeric columns, compute the correlation matrix, and list
# distinct column pairs with correlation above `threshold`. Replaces
# three verbatim copies of the same pipeline. Returns every intermediate
# so the original top-level variable names can all be preserved.
analyze_correlations <- function(df, threshold = 0.8, top_n = 50) {
  numeric_df <- df[sapply(df, is.numeric)]
  # zero-variance columns yield NA correlations and a warning, exactly
  # as in the original; NA comparisons are dropped by which()
  corr_matrix <- cor(numeric_df)
  hits <- which(upper.tri(corr_matrix, diag = TRUE) & corr_matrix > threshold,
                arr.ind = TRUE)
  pairs <- data.frame(Column1 = rownames(corr_matrix)[hits[, 1]],
                      Column2 = colnames(corr_matrix)[hits[, 2]],
                      Correlation = corr_matrix[hits])
  # drop the trivial self-correlations kept by diag = TRUE
  pairs <- pairs[pairs$Column1 != pairs$Column2, ]
  list(numeric = numeric_df,
       matrix = corr_matrix,
       highly = hits,
       most = pairs,
       top = pairs[order(-pairs$Correlation), ][1:top_n, ])
}
# bottom quartile (note: bottom_25_percent is reassigned to its numeric
# columns only, matching the original behavior relied on by later plots)
b_res <- analyze_correlations(bottom_25_percent)
bottom_25_percent <- b_res$numeric
b_correlation_matrix <- b_res$matrix
b_highly_correlated <- b_res$highly
b_most_correlated <- b_res$most
b_top_50 <- b_res$top
# middle 50 percent
m_res <- analyze_correlations(middle_50_percent)
middle_50_percent <- m_res$numeric
m_correlation_matrix <- m_res$matrix
m_highly_correlated <- m_res$highly
m_most_correlated <- m_res$most
m_top_50 <- m_res$top
# top quartile
t_res <- analyze_correlations(top_25_percent)
top_25_percent <- t_res$numeric
t_correlation_matrix <- t_res$matrix
t_highly_correlated <- t_res$highly
t_most_correlated <- t_res$most
t_top_50 <- t_res$top
# some of the most interesting correlations
# (narrative findings recorded as print() calls so they appear in the
# knitted output)
print("The bottom quartile of schools had a large correlation between high needs students and economically disadvantaged students. As a matter of fact, it is only the top quartile that doesn't see as large of a correlation between high needs students and economically disadvantaged students.")
## [1] "The bottom quartile of schools had a large correlation between high needs students and economically disadvantaged students. As a matter of fact, it is only the top quartile that doesn't see as large of a correlation between high needs students and economically disadvantaged students."
print("Another interesting note is that the bottom and top quartile both share strong correlations between their reading/writing SAT scores and their math SAT scores, whereas the middle 50% only sees a high correlation between the reading and writing scores.")
## [1] "Another interesting note is that the bottom and top quartile both share strong correlations between their reading/writing SAT scores and their math SAT scores, whereas the middle 50% only sees a high correlation between the reading and writing scores."
#plot the histogram of majority-white vs. non-majority-white schools
white <- education %>% subset(X..White >= 50)
nonwhite <- education %>% subset(X..White < 50)
# BUG FIX: the original passed `group = fill` to ggplot(); that argument
# is silently swallowed by ggplot()'s ... and did nothing, so it has
# been removed
whitehist <- ggplot(education) +
geom_histogram(data = white, aes(x = X..White, fill = "White Schools" ), color = "black", position = "dodge", alpha = 0.5, bins = 30) +
geom_histogram(data = nonwhite, aes(x = X..White, fill = "Non-White Schools"), color = "black", position = "dodge", alpha = 0.5, bins = 30) +
labs(title = "Proportion of White Students in Schools",
x = "Proportion of White Students",
y = "Frequency") +
scale_fill_manual(values = c("White Schools" = "darkgreen", "Non-White Schools" = "gold"),
labels = c("Non-White Schools", "White Schools")) +
theme_minimal()
# render as an interactive plotly widget
ggplotly(whitehist)
# Visualizing the relationship between columns of interest (bottom quartile)
bottom_plot <- ggplot(bottom_25_percent,
aes(y = X..Graduated, x = X..High.Needs)) +
# FIX: reference Average.Class.Size by name inside aes(); the original
# used bottom_25_percent$Average.Class.Size, which bypasses ggplot's
# data masking and breaks if the layer data ever differs
geom_point(aes(color = X..Economically.Disadvantaged, size = Average.Class.Size)) +
labs(y = "Graduation Rate", x = "High Needs", color = "Economically Disadvantaged", size = "Class Size") +
ggtitle("Graduation Rate related to High Needs, Economically Disadvantaged, and Class Size") +
scale_color_gradient(low = "orange", high = "purple") +
scale_size_continuous(range = c(2, 8)) +
theme_minimal() +
# set the title size once, after theme_minimal() so it takes effect;
# the original set size 18 and then 4, and both were overridden by the
# trailing theme_minimal()
theme(plot.title = element_text(size = 18))
bottom_plot_interactive <- ggplotly(bottom_plot)
bottom_plot_interactive
# Visualizing the relationship between columns of interest for the top
# quartile: graduation rate vs high-needs share, point size = class
# size, color = economically disadvantaged share.
top_plot <- ggplot(top_25_percent, aes(x = X..High.Needs, y = X..Graduated)) +
  geom_point(aes(size = Average.Class.Size,
                 color = X..Economically.Disadvantaged)) +
  ggtitle("Graduation Rate related to High Needs, Economically Disadvantaged, and Class Size") +
  labs(x = "High Needs", y = "Graduation Rate",
       color = "Economically Disadvantaged", size = "Class Size") +
  scale_size_continuous(range = c(1, 8)) +
  scale_color_gradient(low = "orange", high = "purple") +
  theme_minimal()
# render as an interactive plotly widget
top_plot_interactive <- ggplotly(top_plot)
top_plot_interactive